#!/usr/bin/php
<?php
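
/**
 * Scrape issues (and their comments) from the bug trackers listed in
 * SOURCES and dump each one as a plain-text file under ./issues/.
 * Fetched pages are cached under ./cache/ so re-runs do not hammer the
 * remote hosts; every commenter seen is recorded under ./user/.
 *
 * Run this from the directory that should hold cache/, issues/ and user/,
 * since all paths are relative.
 */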

require 'vendor/autoload.php';

use Goutte\Client;
use Symfony\Component\DomCrawler\Crawler;

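// Tracker URL => scraper type. Only the 'notabug' scraper is implemented
// in getAllIssues() so far; the other types are placeholders.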
const SOURCES = [
    'https://notabug.org/diogo/gnu-social' => 'notabug',
    // 'https://bugz.foocorp.net/project/view/3/' => 'phabricator',
    'https://bugz.foocorp.net/maniphest/query/clvSMla6p5o3' => 'phabricator',
    'https://git.gnu.io/gnu/gnu-social' => 'gitlab',
    'https://github.com/chimo?tab=repositories&q=gs' => 'github-repo_list'
];


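/**
 * Fetch $url through the Goutte client, backed by an on-disk cache at
 * ./cache/{type}/{section}/{num}. Cache files hold the final
 * (post-redirect) URI on their first line and the raw HTML after it, so
 * redirects can still be detected when a page is replayed from cache.
 *
 * Returns null on a failed request or a non-200 response; nothing is
 * cached in that case.
 */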
function getCachedCrawler(Client $client, string $url, string $type, string $section, int $num): ?Crawler {
    $file = "./cache/{$type}/{$section}/{$num}";

    if (!file_exists($file)) {
        // HTTP request through the Goutte client; redirects are followed,
        // so getUri() on the returned crawler reflects the final location
        echo "Requesting {$url}\n";
        try {
            $crawler = $client->request('GET', $url);
        } catch (\Exception $e) {
            return null;
        }

        // Don't cache error pages, so transient failures can be retried
        // (Response::getStatusCode() requires symfony/browser-kit >= 4.3)
        if ($client->getResponse()->getStatusCode() !== 200) {
            return null;
        }

        $dir = dirname($file);
        if (!file_exists($dir)) {
            mkdir($dir, 0777, $_recursive = true);
        }
        // Store the final URI on the first line, the raw HTML after it
        file_put_contents($file, $crawler->getUri() . "\n" . $client->getResponse()->getContent());

        return $crawler;
    }

    // Cached file: build the crawler straight from the stored HTML rather
    // than through Client::request(), which would hit the network again
    echo "Retrieving cached URI {$url}\n";
    [$uri, $html] = explode("\n", file_get_contents($file), 2);

    return new Crawler($html, $uri);
}

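/**
 * Crawl every configured source and return the issues found as arrays of
 * the form ['title', 'author', 'number', 'time', 'status', 'comments'].
 * As a side effect, an empty marker file is created under ./user/ for
 * every commenter encountered.
 */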
function getAllIssues(Client $client, array $sources): array {
    $pages = [];
    $users = [];
    $issues = [];

    foreach ($sources as $source => $type) {
        switch ($type) {
        case 'notabug':
            // Only the 'issues' section is scraped for now; add 'pulls' to
            // this list to also collect pull requests
            foreach (['issues'] as $section) {
                for ($i = 1; ; ++$i) {
                    if ($i == 24) {
                        // For some reason 24 always returns 500
                        continue;
                    }

                    $host = parse_url($source, PHP_URL_HOST);
                    $url = "{$source}/{$section}/{$i}";
                    $path = "./issues/{$host}/{$i}";
                    if (file_exists($path)) {
                        echo "Skipping {$url}\n";
                        continue;
                    }

                    $crawler = getCachedCrawler($client, $url, $type, $section, $i);

                    if (is_null($crawler)) {
                        // Failed request or non-200 response: we are past
                        // the last issue (or the tracker is down)
                        break;
                    }

                    $pages[$url] = "{$type}-{$section}";

                    if ($crawler->getUri() !== $url) {
                        // Redirect from issue to pull request, ignore
                        continue;
                    }
                    $title = $crawler->filter('#issue-title')->text();
                    $status = strtolower($crawler->filter('div.label')->text());
                    $time = $crawler->filter('span.time-since')->attr('title');
                    $author = $host . $crawler->filter('span.time-desc > a')->attr('href');

                    $comments = [];
                    $crawl_comments = $crawler->filter('ui.comments div.comment');
                    $crawl_comments->each(function ($comment, $i) use ($host, &$users, &$comments) {
                        $header = $comment->filter('div.header > span');
                        $user = $host . $header->first()->attr('href');
                        $users[] = $user;
                        $time = $header->filter('a:last-child > span')->attr('title');
                        $content = $comment->filter('div.raw-content')->text();
                        $comments[] = ['user' => $user, 'time' => $time, 'content' => $content];
                    });

                    $issues[] = [
                        'title'    => $title,
                        'author'   => $author,
                        'number'   => "{$host}/{$i}",
                        'time'     => $time,
                        'status'   => $status,
                        'comments' => $comments
                    ];
                }
            }
            break;
        // case 'phabricator':
        //     $crawler = getCachedCrawler($client, $source, $type, 'issues');
        //     $pages[$source] = "{$type}-issues";
        //     break;
        default:
            // Other tracker types are not handled yet
        }
    }

    // Record every commenter as an empty marker file in the `user` folder
    foreach (array_unique($users) as $user) {
        $path = './user/' . $user;
        if (!file_exists($path)) {
            $dir = dirname($path);
            if (!file_exists($dir)) {
                mkdir($dir, 0777, $_recursive = true);
            }
            touch($path);
        }
    }

    return $issues;
}


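// Scrape all configured sources, then write one file per issue.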
$client = new Client();
$issues = getAllIssues($client, SOURCES);

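// Each issue is written to ./issues/{host}/{number} as a front-matter
// header (title/author/time/status) followed by one block per comment.
// Values are written verbatim, with no YAML escaping.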
foreach ($issues as $issue) {
    $path = './issues/' . $issue['number'];
    if (!file_exists(dirname($path))) {
        mkdir(dirname($path), 0777, $_recursive = true);
    }

    $content =
"---
title: {$issue['title']}
author: {$issue['author']}
time: {$issue['time']}
status: {$issue['status']}
---
";
    foreach ($issue['comments'] as $comment) {
        $content .= "
author: {$comment['user']}
time: {$comment['time']}
content: -----
{$comment['content']}
-----
";
    }

    file_put_contents($path, $content);
}
